package com.shujia.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.{SparkConf, SparkContext}

object Demo23Cache {

  /**
    * Demonstrates RDD caching: when the same RDD feeds more than one action,
    * persisting it avoids recomputing the whole lineage for each action.
    * Reads "data/students.txt" (CSV: id,name,age,gender,clazz), then counts
    * students per class and per gender.
    */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
      .setMaster("local")
      .setAppName("Demo23Cache")

    val sc = new SparkContext(conf)

    val studentsRDD: RDD[String] = sc.textFile("data/students.txt")

    // 1. Split each CSV line into a tuple of (id, name, age, gender, clazz).
    val studentTupleRDD: RDD[(String, String, Int, String, String)] = studentsRDD.map(student => {

      // Printed once per record per job — makes it visible that, without
      // caching, this map would be recomputed by every downstream action.
      println("studentTupleRDD")

      val split: Array[String] = student.split(",")
      (split(0), split(1), split(2).toInt, split(3), split(4))
    })

    /**
      * When an RDD is used multiple times, cache it so later actions reuse
      * the materialized data instead of re-reading and re-parsing the file.
      */

    // cache() is shorthand for persist(StorageLevel.MEMORY_ONLY).
    //studentTupleRDD.cache()

    // Use an explicit storage level: serialized in memory (less heap, more CPU).
    studentTupleRDD.persist(StorageLevel.MEMORY_ONLY_SER)

    /**
      * Count the number of students per class.
      */

    // 1. Extract the class; unused columns are matched with underscores.
    val clazzRDD: RDD[(String, Int)] = studentTupleRDD.map {
      case (_, _, _, _, clazz) =>
        (clazz, 1)
    }

    // 2. Sum the counts per class.
    val clazzNumRDD: RDD[(String, Int)] = clazzRDD.reduceByKey(_ + _)

    clazzNumRDD.foreach(println)

    /**
      * Count the number of students per gender — the second use of the
      * cached RDD, which now reads from the persisted data.
      */

    // 1. Extract the gender; unused columns are matched with underscores.
    val genderRDD: RDD[(String, Int)] = studentTupleRDD.map {
      case (_, _, _, gender, _) =>
        (gender, 1)
    }

    // 2. Sum the counts per gender.
    val genderNumRDD: RDD[(String, Int)] = genderRDD.reduceByKey(_ + _)

    genderNumRDD.foreach(println)

    // Keep the application alive so the Spark web UI (http://localhost:4040)
    // can be inspected. Sleep instead of busy-spinning so we don't peg a core.
    while (true) {
      Thread.sleep(1000)
    }

  }

}
